In [3]:
import nltk
import pandas as pd

NLTK Documentation

 Understanding how NLTK works
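
 The tokenizer and tagger used below need their NLTK data packages installed once. A minimal setup sketch (resource names assumed from the standard NLTK distribution):

In [ ]:
# Models used by nltk.word_tokenize / nltk.sent_tokenize
nltk.download('punkt')
# Model used by nltk.pos_tag
nltk.download('averaged_perceptron_tagger')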


In [4]:
sentence = """At eight o'clock on Thursday morning
... Arthur didn't feel very good."""

In [5]:
tokens = nltk.word_tokenize(sentence)

In [6]:
tokens


Out[6]:
['At',
 'eight',
 "o'clock",
 'on',
 'Thursday',
 'morning',
 'Arthur',
 'did',
 "n't",
 'feel',
 'very',
 'good',
 '.']

In [7]:
tagged = nltk.pos_tag(tokens)

In [8]:
tagged


Out[8]:
[('At', 'IN'),
 ('eight', 'CD'),
 ("o'clock", 'NN'),
 ('on', 'IN'),
 ('Thursday', 'NNP'),
 ('morning', 'NN'),
 ('Arthur', 'NNP'),
 ('did', 'VBD'),
 ("n't", 'RB'),
 ('feel', 'VB'),
 ('very', 'RB'),
 ('good', 'JJ'),
 ('.', '.')]
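
 The tags are Penn Treebank codes (IN = preposition, CD = cardinal number, NNP = singular proper noun, and so on). NLTK can describe an unfamiliar tag; a small lookup sketch (needs the 'tagsets' data package):

In [ ]:
nltk.download('tagsets')
# Print the definition and examples for the proper-noun tag
nltk.help.upenn_tagset('NNP')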

So let's try to use this on our Daphne data.


In [9]:
df = pd.read_csv('allPostText_test.csv')

In [10]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 11 columns):
Unnamed: 0      100 non-null int64
Unnamed: 0.1    100 non-null int64
Date_1          100 non-null int64
Date_2          100 non-null int64
Date_3          100 non-null object
ID_page         100 non-null int64
ID_post         100 non-null int64
Link            100 non-null object
Title           100 non-null object
Txt             89 non-null object
Text            100 non-null object
dtypes: int64(6), object(5)
memory usage: 8.7+ KB

In [11]:
def vec(name):
    tokens = nltk.word_tokenize(name)
    tagged = nltk.pos_tag(tokens)
    return tagged

In [12]:
df['Text'].apply(vec).head(10)


Out[12]:
0    [(«, VB), (back, RB), (to, TO), (home, NN), (Y...
1    [(«, VB), (back, RB), (to, TO), (home, NN), (E...
2    [(«, VB), (back, RB), (to, TO), (home, NN), (I...
3    [(«, VB), (back, RB), (to, TO), (home, NN), (T...
4    [(«, VB), (back, RB), (to, TO), (home, NN), (“...
5    [(«, VB), (back, RB), (to, TO), (home, NN), («...
6    [(«, VB), (back, RB), (to, TO), (home, NN), (I...
7    [(«, VB), (back, RB), (to, TO), (home, NN), (T...
8    [(«, VB), (back, RB), (to, TO), (home, NN), («...
9    [(«, VB), (back, RB), (to, TO), (home, NN), (D...
Name: Text, dtype: object

In [13]:
df['Tags'] = df['Text'].apply(vec)
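
 Tagging is the slow step, and the previous two cells each ran it over the full column. On a larger dataset it would be cheaper to tag once and preview the stored column, roughly:

In [ ]:
df['Tags'] = df['Text'].apply(vec)
df['Tags'].head(10)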

 Only keep the tokens whose tag (the second item of each pair) is NNP


In [14]:
def token(tags):
    mini_list = []
    # Keep the word of every (word, tag) pair tagged NNP (proper noun, singular)
    for elem in tags:
        if elem[1] == 'NNP':
            mini_list.append(elem[0])
    return mini_list
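
 As a quick sanity check, applying it to the tagged example sentence from earlier should keep only the proper nouns:

In [ ]:
# In the Out[8] list above, only 'Thursday' and 'Arthur' carry the NNP tag
token(tagged)
# expected: ['Thursday', 'Arthur']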

In [15]:
df['People list'] = df['Tags'].apply(token)

In [16]:
df['People list'].head(10)


Out[16]:
0                                                   []
1    [Kurz, Austria’s, People’s, Party, OVP, Freedo...
2    [Naxxar, Labour, Party, Prime, Minister’s, Sun...
3    [Nationalist, Party, Nationalist, Party, Malta...
4                                              [“I, «]
5                                                  [«]
6                                                   []
7    [Toni, Bezzina, Nationalist, Party’s, MP, Robe...
8                                                  [«]
9    [David, Agius, Nationalist, Party’s, Edwin, Va...
Name: People list, dtype: object

 Not great, so let's look around for a better solution.


In [17]:
# This looks promising

# Run "sudo python -m nltk.downloader all" in a shell if you have problems
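
 The full downloader fetches every corpus; if only the named-entity chunker is missing, a lighter in-notebook sketch (resource names assumed from the standard NLTK distribution) is:

In [ ]:
# Data used by nltk.ne_chunk
nltk.download('maxent_ne_chunker')
nltk.download('words')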


In [54]:
for sent in nltk.sent_tokenize(df['Text'][1]):
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
        if hasattr(chunk, 'label'):
            print(chunk.label()+',', ' '.join(c[0] for c in chunk))


PERSON, Sebastian Kurz
ORGANIZATION, OVP
ORGANIZATION, Freedom Party
ORGANIZATION, FPÖ
ORGANIZATION, Social Democrats
ORGANIZATION, People’s Party
ORGANIZATION, People’s Party
GPE, Sebastian
PERSON, Kurz
PERSON, Christian Kern
ORGANIZATION, Social Democrats
PERSON, Kurz
GSP, Austria

In [61]:
def peopled(elem):
    mini_list = []
    # Split the post into sentences, then tokenize, POS-tag and NE-chunk each one
    for sent in nltk.sent_tokenize(elem):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            # Named-entity chunks are subtrees that carry a label (PERSON, ORGANIZATION, ...)
            if hasattr(chunk, 'label'):
                p = chunk.label(), ' '.join(c[0] for c in chunk)
                mini_list.append(p)
    return mini_list

In [63]:
df['people'] = df['Text'].apply(peopled)

 Only getting the people (the PERSON entities)


In [65]:
lst = list(df['people'])

In [71]:
lst = [x for x in lst if x !=[]]

In [72]:
flat_list = [item for sublist in lst for item in sublist]

In [74]:
name_list = []
for name in flat_list:
    if name[0] == 'PERSON':
        name_list.append(name[1])
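
 The three steps above (drop the empty lists, flatten, keep PERSON entries) could also be written as a single comprehension over the 'people' column; a sketch assuming the same (label, name) tuples:

In [ ]:
name_list = [name for ents in df['people'] for label, name in ents if label == 'PERSON']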

In [79]:
pd.DataFrame(name_list)[0].value_counts()


Out[79]:
Delia                   41
Adrian Delia            31
Malta                   14
Jean Pierre Debono      14
Debono                  12
Muscat                  10
Mrs Delia                8
Agius                    7
Rebecca Dimech           7
Joseph Muscat            7
David Agius              6
Anton Rea Cutajar        5
David                    5
Frank Portelli           5
Adrian Delia’s           5
Clyde Puli               4
Clyde                    4
Kristy Debono            4
Bundy                    4
Cutajar                  4
Robert Arrigo            4
Eddie Fenech Adami       3
Keith Schembri           3
Keith                    3
Kristy                   3
Borg Olivier             3
Edwin Vassallo           3
Andre Falzon             3
Fenech Adami             2
Kurt Farrugia            2
                        ..
Hubert Zammit            1
Bad                      1
Pasta Rummo              1
Toni Bezzina             1
Kevin Cassar             1
Censu L-Iswed            1
Rudyard                  1
Xaraban                  1
Mad                      1
Puli                     1
Alexander                1
Borg                     1
Malta here.The Times     1
Hang                     1
Mandalay                 1
Farrugia                 1
Censu                    1
Leonardo Fasoli          1
Manwel Dimech Street     1
Gozo                     1
Opposition               1
Chris Said               1
Rebecca                  1
Pierre                   1
Joseph Muscat.”          1
Joe                      1
Botox                    1
Beppe Fenech Adami       1
Gonzi                    1
Maze Pictures            1
Name: 0, Length: 171, dtype: int64